import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# libraries for models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
# metrics evaluation libraries
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve, RocCurveDisplay
project_data = pd.read_csv("data.csv")
project_data = project_data.drop(columns=["id"]) # droping unwanted columns
project_data.head()
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | sensor_id | sensor_name | sensor_data | biomarker_name | urine_biomarker_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 | NanoSensor1 | 15.5 | Nuclear Matrix Protein 22 (NMP22) | 20.3 |
| 1 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 | NanoSensor1 | 15.5 | Nuclear Matrix Protein 22 (NMP22) | 18.9 |
| 2 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 | NanoSensor1 | 15.5 | Nuclear Matrix Protein 22 (NMP22) | 22.1 |
| 3 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 | NanoSensor1 | 15.5 | Nuclear Matrix Protein 22 (NMP22) | 19.5 |
| 4 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 | NanoSensor1 | 15.5 | Nuclear Matrix Protein 22 (NMP22) | 45.2 |
5 rows × 36 columns
project_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 36 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 diagnosis 569 non-null object 1 radius_mean 569 non-null float64 2 texture_mean 569 non-null float64 3 perimeter_mean 569 non-null float64 4 area_mean 569 non-null float64 5 smoothness_mean 569 non-null float64 6 compactness_mean 569 non-null float64 7 concavity_mean 569 non-null float64 8 concave points_mean 569 non-null float64 9 symmetry_mean 569 non-null float64 10 fractal_dimension_mean 569 non-null float64 11 radius_se 569 non-null float64 12 texture_se 569 non-null float64 13 perimeter_se 569 non-null float64 14 area_se 569 non-null float64 15 smoothness_se 569 non-null float64 16 compactness_se 569 non-null float64 17 concavity_se 569 non-null float64 18 concave points_se 569 non-null float64 19 symmetry_se 569 non-null float64 20 fractal_dimension_se 569 non-null float64 21 radius_worst 569 non-null float64 22 texture_worst 569 non-null float64 23 perimeter_worst 569 non-null float64 24 area_worst 569 non-null float64 25 smoothness_worst 569 non-null float64 26 compactness_worst 569 non-null float64 27 concavity_worst 569 non-null float64 28 concave points_worst 569 non-null float64 29 symmetry_worst 569 non-null float64 30 fractal_dimension_worst 569 non-null float64 31 sensor_id 569 non-null int64 32 sensor_name 569 non-null object 33 sensor_data 569 non-null float64 34 biomarker_name 569 non-null object 35 urine_biomarker_value 569 non-null float64 dtypes: float64(32), int64(1), object(3) memory usage: 160.2+ KB
project_data.describe()
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | sensor_id | sensor_data | urine_biomarker_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
| mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 | 2.367311 | 36.987346 | 29.605800 |
| std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 | 0.847773 | 21.069023 | 13.561382 |
| min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 | 1.000000 | 3.800000 | 4.200000 |
| 25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | 0.057700 | ... | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 | 2.000000 | 15.500000 | 19.500000 |
| 50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 | 3.000000 | 42.200000 | 31.400000 |
| 75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | 0.066120 | ... | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 | 3.000000 | 53.700000 | 41.300000 |
| max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 | 3.000000 | 62.400000 | 55.600000 |
8 rows × 33 columns
project_data.shape
(569, 36)
project_data.columns
Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_name',
'sensor_data', 'biomarker_name', 'urine_biomarker_value'],
dtype='object')
project_data.isna().sum()
diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 concave points_mean 0 symmetry_mean 0 fractal_dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 concave points_se 0 symmetry_se 0 fractal_dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 concave points_worst 0 symmetry_worst 0 fractal_dimension_worst 0 sensor_id 0 sensor_name 0 sensor_data 0 biomarker_name 0 urine_biomarker_value 0 dtype: int64
plt.figure(figsize=(20,10))
sns.countplot(x=project_data["diagnosis"])
print(project_data["diagnosis"].value_counts())
diagnosis B 357 M 212 Name: count, dtype: int64
numeric_columns = project_data.select_dtypes(include=['float64', 'int64']).columns
corr = project_data[numeric_columns].corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr, annot=True, cmap="YlGnBu")
plt.show()
X_train=project_data.drop(columns=["diagnosis"])
y_train=project_data["diagnosis"]
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)
print('Train dataset shape:',X_train.shape)
print('Test dataset shape', y_train.shape)
Train dataset shape: (455, 35) Test dataset shape (455,)
numeric_columns = X_train.select_dtypes(exclude='object').columns
print(numeric_columns)
print('*'*100)
categorical_columns = X_train.select_dtypes(include='object').columns
print(categorical_columns)
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
'urine_biomarker_value'],
dtype='object')
****************************************************************************************************
Index(['sensor_name', 'biomarker_name'], dtype='object')
numeric_features = Pipeline([
('handlingmissingvalues',SimpleImputer(strategy='median')),
('scaling',StandardScaler(with_mean=True))
])
print(numeric_features)
print('*'*100)
categorical_features = Pipeline([
('handlingmissingvalues',SimpleImputer(strategy='most_frequent')),
('encoding', OneHotEncoder()),
('scaling', StandardScaler(with_mean=False))
])
print(categorical_features)
processing = ColumnTransformer([
('numeric', numeric_features, numeric_columns),
('categorical', categorical_features, categorical_columns)
])
processing
Pipeline(steps=[('handlingmissingvalues', SimpleImputer(strategy='median')),
('scaling', StandardScaler())])
****************************************************************************************************
Pipeline(steps=[('handlingmissingvalues',
SimpleImputer(strategy='most_frequent')),
('encoding', OneHotEncoder()),
('scaling', StandardScaler(with_mean=False))])
ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('handlingmissingvalues',
SimpleImputer(strategy='median')),
('scaling',
StandardScaler())]),
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture...
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
'urine_biomarker_value'],
dtype='object')),
('categorical',
Pipeline(steps=[('handlingmissingvalues',
SimpleImputer(strategy='most_frequent')),
('encoding', OneHotEncoder()),
('scaling',
StandardScaler(with_mean=False))]),
Index(['sensor_name', 'biomarker_name'], dtype='object'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('handlingmissingvalues',
SimpleImputer(strategy='median')),
('scaling',
StandardScaler())]),
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture...
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
'urine_biomarker_value'],
dtype='object')),
('categorical',
Pipeline(steps=[('handlingmissingvalues',
SimpleImputer(strategy='most_frequent')),
('encoding', OneHotEncoder()),
('scaling',
StandardScaler(with_mean=False))]),
Index(['sensor_name', 'biomarker_name'], dtype='object'))])Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'sensor_id', 'sensor_data',
'urine_biomarker_value'],
dtype='object')SimpleImputer(strategy='median')
StandardScaler()
Index(['sensor_name', 'biomarker_name'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder()
StandardScaler(with_mean=False)
def prepare_confusion_matrix(algo, model):
print(algo)
plt.figure(figsize=(12,8))
pred = model.predict(X_test)
cm = confusion_matrix(y_test, pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
plt.show()
# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix');
def prepare_classification_report(algo, model):
print(algo+' Report :')
pred = model.predict(X_test)
print(classification_report(y_test, pred))
def prepare_roc_curve(algo, model):
print(algo)
y_pred_proba = model.predict_proba(X_test)[::,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
curve = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
curve.plot()
plt.show()
algorithms = [('bagging classifier', BaggingClassifier()),
('KNN classifier', KNeighborsClassifier()),
('Random Forest calssifier', RandomForestClassifier()),
('Adaboost classifier', AdaBoostClassifier()),
('Gradientboot classifier',GradientBoostingClassifier()),
('MLP', MLPClassifier())
]
trained_models = []
model_and_score = {}
for index, tup in enumerate(algorithms):
model = prepare_model(tup[1])
model_and_score[tup[0]] = str(model.score(X_train,y_train)*100)+"%"
trained_models.append((tup[0],model))
print(model_and_score)
print(model_and_score)
{'bagging classifier': '98.24175824175823%', 'KNN classifier': '94.28571428571428%', 'Random Forest calssifier': '100.0%', 'Adaboost classifier': '97.8021978021978%', 'Gradientboot classifier': '100.0%', 'MLP': '94.5054945054945%'}
for index, tup in enumerate(trained_models):
prepare_confusion_matrix(tup[0], tup[1])
bagging classifier
KNN classifier
Random Forest calssifier
Adaboost classifier
Gradientboot classifier
MLP
for index, tup in enumerate(trained_models):
prepare_classification_report(tup[0], tup[1])
print("\n")
bagging classifier Report :
precision recall f1-score support
B 0.95 0.93 0.94 81
M 0.83 0.88 0.85 33
accuracy 0.91 114
macro avg 0.89 0.90 0.90 114
weighted avg 0.91 0.91 0.91 114
KNN classifier Report :
precision recall f1-score support
B 0.96 0.96 0.96 81
M 0.91 0.91 0.91 33
accuracy 0.95 114
macro avg 0.94 0.94 0.94 114
weighted avg 0.95 0.95 0.95 114
Random Forest calssifier Report :
precision recall f1-score support
B 0.99 0.95 0.97 81
M 0.89 0.97 0.93 33
accuracy 0.96 114
macro avg 0.94 0.96 0.95 114
weighted avg 0.96 0.96 0.96 114
Adaboost classifier Report :
precision recall f1-score support
B 0.99 0.94 0.96 81
M 0.86 0.97 0.91 33
accuracy 0.95 114
macro avg 0.93 0.95 0.94 114
weighted avg 0.95 0.95 0.95 114
Gradientboot classifier Report :
precision recall f1-score support
B 0.97 0.95 0.96 81
M 0.89 0.94 0.91 33
accuracy 0.95 114
macro avg 0.93 0.95 0.94 114
weighted avg 0.95 0.95 0.95 114
MLP Report :
precision recall f1-score support
B 0.99 0.95 0.97 81
M 0.89 0.97 0.93 33
accuracy 0.96 114
macro avg 0.94 0.96 0.95 114
weighted avg 0.96 0.96 0.96 114
le = LabelEncoder()
project_data['diagnosis_encoded'] = le.fit_transform(project_data['diagnosis'])
# Plot KDEs for numeric features by diagnosis
for column in numeric_columns:
plt.figure(figsize=(12, 8))
sns.kdeplot(data=project_data, x=column, hue='diagnosis', palette="crest", fill=True)
plt.title(f'Distribution of {column} by Diagnosis')
plt.show()
# Plotting Pie Chart for Diagnosis
plt.figure(figsize=(8, 8))
plt.pie(project_data['diagnosis'].value_counts(), labels=project_data['diagnosis'].value_counts().index, autopct='%1.1f%%', colors=['lightcoral', 'skyblue'])
plt.title('Distribution of Diagnosis')
plt.show()
print(len(X_train), len(Y_train_encoded))
272 272
df=pd.read_csv('data.csv')
df.head(10)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[16], line 1 ----> 1 df.head(10) NameError: name 'df' is not defined
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 32 sensor_id 569 non-null int64 33 sensor_name 569 non-null object 34 sensor_data 569 non-null float64 35 biomarker_name 569 non-null object 36 urine_biomarker_value 569 non-null float64 dtypes: float64(32), int64(2), object(3) memory usage: 164.6+ KB
df.isna().sum()
id 0 diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 concave points_mean 0 symmetry_mean 0 fractal_dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 concave points_se 0 symmetry_se 0 fractal_dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 concave points_worst 0 symmetry_worst 0 fractal_dimension_worst 0 sensor_id 0 sensor_name 0 sensor_data 0 biomarker_name 0 urine_biomarker_value 0 dtype: int64
df.describe()
| id | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | sensor_id | sensor_data | urine_biomarker_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.690000e+02 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
| mean | 3.037183e+07 | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | ... | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 | 2.367311 | 36.987346 | 29.605800 |
| std | 1.250206e+08 | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | ... | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 | 0.847773 | 21.069023 | 13.561382 |
| min | 8.670000e+03 | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | ... | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 | 1.000000 | 3.800000 | 4.200000 |
| 25% | 8.692180e+05 | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | ... | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 | 2.000000 | 15.500000 | 19.500000 |
| 50% | 9.060240e+05 | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | ... | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 | 3.000000 | 42.200000 | 31.400000 |
| 75% | 8.813129e+06 | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | ... | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 | 3.000000 | 53.700000 | 41.300000 |
| max | 9.113205e+08 | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | ... | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 | 3.000000 | 62.400000 | 55.600000 |
8 rows × 34 columns
df = df.dropna(axis=1)
df.shape
(569, 37)
df['diagnosis'].value_counts()
diagnosis B 357 M 212 Name: count, dtype: int64
import seaborn as sns
lb=LabelEncoder()
plt.figure(figsize=(25,25))
<Figure size 2500x2500 with 0 Axes>
<Figure size 2500x2500 with 0 Axes>
sns.pairplot(df.iloc[:, 1:5], hue="diagnosis")
<seaborn.axisgrid.PairGrid at 0x28d8543f010>
X=df.iloc[:, 2:32].values
X
array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
1.189e-01],
[2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
8.902e-02],
[1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
8.758e-02],
...,
[1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
7.820e-02],
[2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
1.240e-01],
[7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
7.039e-02]])
y = df.iloc[:, 1].values
y
array(['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M',
'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M',
'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'M',
'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B',
'M', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B',
'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'M', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M',
'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B',
'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'M', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M', 'M',
'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'M',
'M', 'B', 'M', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'M',
'B', 'M', 'M', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'M', 'B', 'M',
'B', 'B', 'M', 'B', 'M', 'M', 'M', 'M', 'B', 'B', 'M', 'M', 'B',
'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M',
'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'B',
'B', 'B', 'B', 'M', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M',
'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'B',
'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
'B', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'M', 'B',
'B', 'B', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'M', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B',
'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M',
'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B',
'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'M',
'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'B',
'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B',
'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'B',
'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'M', 'B', 'B', 'B',
'B', 'B', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'M', 'M', 'B', 'B',
'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
'B', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'B'], dtype=object)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X_train = st.fit_transform(X_train)
X_test = st.fit_transform(X_test)
X_train.shape
(455, 30)
y_train.shape
(455,)
from sklearn.linear_model import LogisticRegression, LinearRegression
log = LogisticRegression()
log.fit(X_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
log.score(X_train, y_train)
0.989010989010989
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_test, log.predict(X_test))
0.956140350877193
print(classification_report(y_test, log.predict(X_test)))
precision recall f1-score support
B 0.96 0.97 0.96 67
M 0.96 0.94 0.95 47
accuracy 0.96 114
macro avg 0.96 0.95 0.95 114
weighted avg 0.96 0.96 0.96 114
import pickle
pickle.dump(log, open("model.pkl", "wb"))
from sklearn.metrics import accuracy_score
accuracy_scor(y.y_pred)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 1 ----> 1 accuracy_scor(y.y_pred) NameError: name 'accuracy_scor' is not defined
accuracy_score(y.y_pred)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 1 ----> 1 accuracy_score(y.y_pred) NameError: name 'y' is not defined
from sklearn.metrics import accuracy_score
# Assuming y_true is your true labels and y_pred is your predicted labels
y_true = [1, 0, 1, 1, 0]
y_pred = [1, 0, 1, 0, 1]
# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
# Print or use the accuracy value as needed
print("Accuracy:", accuracy)
Accuracy: 0.6
df.head(1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[7], line 1 ----> 1 df.head(1) NameError: name 'df' is not defined
df.head(1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[8], line 1 ----> 1 df.head(1) NameError: name 'df' is not defined
import pandas as pd
# Assuming you have a DataFrame named df
data = {'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']}
df = pd.DataFrame(data)
# Display the first row using head()
df.head(1)
| Column1 | Column2 | |
|---|---|---|
| 0 | 1 | A |
df.head(5)
| Column1 | Column2 | |
|---|---|---|
| 0 | 1 | A |
| 1 | 2 | B |
| 2 | 3 | C |
df.head(10)
| Column1 | Column2 | |
|---|---|---|
| 0 | 1 | A |
| 1 | 2 | B |
| 2 | 3 | C |
import pickle
with open("data.pkl", "wb") as f:
pickle.dump(data, f)
import pickle
with open("data.pkl", "rb") as f:
loaded_data = pickle.load(f)
import pickle
# Load the trained model
with open("your_model.pkl", "rb") as model_file:
model = pickle.load(model_file)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[23], line 4 1 import pickle 3 # Load the trained model ----> 4 with open("your_model.pkl", "rb") as model_file: 5 model = pickle.load(model_file) File ~\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:282, in _modified_open(file, *args, **kwargs) 275 if file in {0, 1, 2}: 276 raise ValueError( 277 f"IPython won't let you open fd={file} by default " 278 "as it is likely to crash IPython. If you know what you are doing, " 279 "you can use builtins' open." 280 ) --> 282 return io_open(file, *args, **kwargs) FileNotFoundError: [Errno 2] No such file or directory: 'your_model.pkl'
import pickle
import pandas as pd
# Load the trained model
with open("data.pkl", "rb") as model_file:
model = pickle.load(model_file)
# Load new data from the CSV file
new_data = pd.read_csv("data.csv")
# Prepare the new data (assuming you need to preprocess it)
# You might need to perform any preprocessing steps applied to the original training data
# Make predictions
predictions = sensor_data.predict(new_data)
# Print or analyze the predictions
print(predictions)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[25], line 15 9 new_data = pd.read_csv("data.csv") 11 # Prepare the new data (assuming you need to preprocess it) 12 # You might need to perform any preprocessing steps applied to the original training data 13 14 # Make predictions ---> 15 predictions = sensor_data.predict(new_data) 17 # Print or analyze the predictions 18 print(predictions) NameError: name 'sensor_data' is not defined
import pickle
import pandas as pd
# Load the trained model
with open("data.pkl", "rb") as model_file:
model = pickle.load(model_file)
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Assuming df is your DataFrame
# Replace this with your actual data loading and preprocessing steps
df = pd.read_csv("data.csv")
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Store accuracy and prediction data
model_data = {
'model': model,
'accuracy': accuracy,
'predictions': y_pred,
}
# Dump data into a pickle file
with open("model_and_data.pkl", "wb") as file:
pickle.dump(model_data, file)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[27], line 18 16 # Train the machine learning model 17 model = RandomForestClassifier() ---> 18 model.fit(X_train, y_train) 20 # Make predictions on the test set 21 y_pred = model.predict(X_test) File ~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:345, in BaseForest.fit(self, X, y, sample_weight) 343 if issparse(y): 344 raise ValueError("sparse multilabel-indicator for y is not supported.") --> 345 X, y = self._validate_data( 346 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE 347 ) 348 if sample_weight is not None: 349 sample_weight = _check_sample_weight(sample_weight, X) File ~\anaconda3\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params) 563 y = check_array(y, input_name="y", **check_y_params) 564 else: --> 565 X, y = check_X_y(X, y, **check_params) 566 out = X, y 568 if not no_val_X and check_params.get("ensure_2d", True): File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 1101 estimator_name = _check_estimator_name(estimator) 1102 raise ValueError( 1103 f"{estimator_name} requires y to be passed, but the target y is None" 1104 ) -> 1106 X = check_array( 1107 X, 1108 accept_sparse=accept_sparse, 1109 accept_large_sparse=accept_large_sparse, 1110 dtype=dtype, 1111 order=order, 1112 copy=copy, 1113 force_all_finite=force_all_finite, 1114 ensure_2d=ensure_2d, 1115 allow_nd=allow_nd, 1116 ensure_min_samples=ensure_min_samples, 1117 ensure_min_features=ensure_min_features, 1118 estimator=estimator, 1119 input_name="X", 1120 ) 1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) 1124 check_consistent_length(X, y) File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 877 array = xp.astype(array, dtype, copy=False) 878 else: --> 879 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp) 880 except ComplexWarning as complex_warning: 881 raise ValueError( 882 "Complex data not supported\n{}\n".format(array) 883 ) from complex_warning File ~\anaconda3\lib\site-packages\sklearn\utils\_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp) 182 xp, _ = get_namespace(array) 183 if xp.__name__ in {"numpy", "numpy.array_api"}: 184 # Use NumPy API to support order --> 185 array = numpy.asarray(array, order=order, dtype=dtype) 186 return xp.asarray(array, copy=copy) 187 else: File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype) 2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: -> 2070 return np.asarray(self._values, dtype=dtype) ValueError: could not convert string to float: 'NanoSensor11'
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
# Load your dataset (replace "your_data.csv" with your actual filename)
df = pd.read_csv("data.csv")
# Identify and separate categorical columns (assuming 'sensor_name' is categorical)
categorical_columns = ['sensor_name']
numeric_columns = df.columns.difference(categorical_columns)
# One-hot encode categorical columns
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_columns = pd.DataFrame(encoder.fit_transform(df[categorical_columns]), columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([df[numeric_columns], encoded_columns], axis=1)
# Split the data into features and target variable
X = df_encoded.drop('diagnosis', axis=1)
y = df['diagnosis']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Store accuracy and prediction data
model_data = {
'model': model,
'accuracy': accuracy,
'predictions': y_pred,
}
# Dump data into a pickle file
with open("model_and_data.pkl", "wb") as file:
pickle.dump(model_data, file)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[28], line 29 27 # Train the machine learning model 28 model = RandomForestClassifier() ---> 29 model.fit(X_train, y_train) 31 # Make predictions on the test set 32 y_pred = model.predict(X_test) File ~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:345, in BaseForest.fit(self, X, y, sample_weight) 343 if issparse(y): 344 raise ValueError("sparse multilabel-indicator for y is not supported.") --> 345 X, y = self._validate_data( 346 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE 347 ) 348 if sample_weight is not None: 349 sample_weight = _check_sample_weight(sample_weight, X) File ~\anaconda3\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params) 563 y = check_array(y, input_name="y", **check_y_params) 564 else: --> 565 X, y = check_X_y(X, y, **check_params) 566 out = X, y 568 if not no_val_X and check_params.get("ensure_2d", True): File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 1101 estimator_name = _check_estimator_name(estimator) 1102 raise ValueError( 1103 f"{estimator_name} requires y to be passed, but the target y is None" 1104 ) -> 1106 X = check_array( 1107 X, 1108 accept_sparse=accept_sparse, 1109 accept_large_sparse=accept_large_sparse, 1110 dtype=dtype, 1111 order=order, 1112 copy=copy, 1113 force_all_finite=force_all_finite, 1114 ensure_2d=ensure_2d, 1115 allow_nd=allow_nd, 1116 ensure_min_samples=ensure_min_samples, 1117 ensure_min_features=ensure_min_features, 1118 estimator=estimator, 1119 input_name="X", 1120 ) 1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) 1124 check_consistent_length(X, y) File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 877 array = xp.astype(array, dtype, copy=False) 878 else: --> 879 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp) 880 except ComplexWarning as complex_warning: 881 raise ValueError( 882 "Complex data not supported\n{}\n".format(array) 883 ) from complex_warning File ~\anaconda3\lib\site-packages\sklearn\utils\_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp) 182 xp, _ = get_namespace(array) 183 if xp.__name__ in {"numpy", "numpy.array_api"}: 184 # Use NumPy API to support order --> 185 array = numpy.asarray(array, order=order, dtype=dtype) 186 return xp.asarray(array, copy=copy) 187 else: File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype) 2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: -> 2070 return np.asarray(self._values, dtype=dtype) ValueError: could not convert string to float: 'Aquaporin-1 (AQP1)'
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
# Load your dataset (replace "your_data.csv" with your actual filename)
df = pd.read_csv("data.csv")
# Identify and separate categorical columns (assuming 'sensor_name' is categorical)
categorical_columns = ['sensor_name']
numeric_columns = df.columns.difference(categorical_columns)
# One-hot encode categorical columns
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_columns = pd.DataFrame(encoder.fit_transform(df[categorical_columns].astype(str)), columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([df[numeric_columns], encoded_columns], axis=1)
# Split the data into features and target variable
X = df_encoded.drop('diagnosis', axis=1)
y = df['diagnosis']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Store accuracy and prediction data
model_data = {
'model': model,
'accuracy': accuracy,
'predictions': y_pred,
}
# Dump data into a pickle file
with open("model_and_data.pkl", "wb") as file:
pickle.dump(model_data, file)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[29], line 29 27 # Train the machine learning model 28 model = RandomForestClassifier() ---> 29 model.fit(X_train, y_train) 31 # Make predictions on the test set 32 y_pred = model.predict(X_test) File ~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:345, in BaseForest.fit(self, X, y, sample_weight) 343 if issparse(y): 344 raise ValueError("sparse multilabel-indicator for y is not supported.") --> 345 X, y = self._validate_data( 346 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE 347 ) 348 if sample_weight is not None: 349 sample_weight = _check_sample_weight(sample_weight, X) File ~\anaconda3\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params) 563 y = check_array(y, input_name="y", **check_y_params) 564 else: --> 565 X, y = check_X_y(X, y, **check_params) 566 out = X, y 568 if not no_val_X and check_params.get("ensure_2d", True): File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 1101 estimator_name = _check_estimator_name(estimator) 1102 raise ValueError( 1103 f"{estimator_name} requires y to be passed, but the target y is None" 1104 ) -> 1106 X = check_array( 1107 X, 1108 accept_sparse=accept_sparse, 1109 accept_large_sparse=accept_large_sparse, 1110 dtype=dtype, 1111 order=order, 1112 copy=copy, 1113 force_all_finite=force_all_finite, 1114 ensure_2d=ensure_2d, 1115 allow_nd=allow_nd, 1116 ensure_min_samples=ensure_min_samples, 1117 ensure_min_features=ensure_min_features, 1118 estimator=estimator, 1119 input_name="X", 1120 ) 1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator) 1124 check_consistent_length(X, y) File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:879, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 877 array = xp.astype(array, dtype, copy=False) 878 else: --> 879 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp) 880 except ComplexWarning as complex_warning: 881 raise ValueError( 882 "Complex data not supported\n{}\n".format(array) 883 ) from complex_warning File ~\anaconda3\lib\site-packages\sklearn\utils\_array_api.py:185, in _asarray_with_order(array, dtype, order, copy, xp) 182 xp, _ = get_namespace(array) 183 if xp.__name__ in {"numpy", "numpy.array_api"}: 184 # Use NumPy API to support order --> 185 array = numpy.asarray(array, order=order, dtype=dtype) 186 return xp.asarray(array, copy=copy) 187 else: File ~\anaconda3\lib\site-packages\pandas\core\generic.py:2070, in NDFrame.__array__(self, dtype) 2069 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: -> 2070 return np.asarray(self._values, dtype=dtype) ValueError: could not convert string to float: 'Aquaporin-1 (AQP1)'
import pickle
# Your classification report data
classification_report_data = {
'accuracy': 0.96,
'precision': {
'B': 0.96,
'M': 0.96,
},
'recall': {
'B': 0.97,
'M': 0.94,
},
'f1-score': {
'B': 0.96,
'M': 0.95,
},
'support': {
'B': 67,
'M': 47,
},
}
# Set your accuracy threshold
accuracy_threshold = 0.95
# Check if there's cancer and if it exceeds the accuracy threshold
if classification_report_data['accuracy'] > accuracy_threshold:
print("There's cancer, and the accuracy exceeds the threshold.")
else:
print("There's cancer, but the accuracy doesn't exceed the threshold.")
# Save to pickle file
with open('classification_report.pkl', 'wb') as file:
pickle.dump(classification_report_data, file)
There's cancer, and the accuracy exceeds the threshold.
import pickle
# Load the pickle file
with open('C:/Users/karri/classification_report.pkl', 'rb') as file:
classification_report_data = pickle.load(file)
# Print the contents
print(classification_report_data)
{'accuracy': 0.96, 'precision': {'B': 0.96, 'M': 0.96}, 'recall': {'B': 0.97, 'M': 0.94}, 'f1-score': {'B': 0.96, 'M': 0.95}, 'support': {'B': 67, 'M': 47}}
from flask import Flask, render_template, request
import pickle
import matplotlib.pyplot as plt
from io import BytesIO
import base64
app = Flask(__name__)
# Load the model and threshold from the pickle file
with open('classification_report.pkl', 'rb') as file:
classification_report_data = pickle.load(file)
# Set your threshold
threshold = 0.95
def generate_plot(prediction):
# Replace this with your actual logic to generate the plot based on user input
# For simplicity, we'll create a bar chart with random data
labels = ['Precision', 'Recall', 'F1-Score']
values = [0.9, 0.8, 0.85] # Replace with your actual values
plt.bar(labels, values)
plt.title(f'Metrics for {prediction} class')
plt.xlabel('Metrics')
plt.ylabel('Values')
# Save the plot to a BytesIO object
image_stream = BytesIO()
plt.savefig(image_stream, format='png')
image_stream.seek(0)
plt.close()
# Convert the BytesIO object to a base64-encoded string
plot_url = base64.b64encode(image_stream.read()).decode('utf-8')
return f"data:image/png;base64,{plot_url}"
@app.route('/')
def home():
return render_template('index.html')
@app.route('/predict', methods=['POST'])
def predict():
if request.method == 'POST':
# Get user input from the form
# Assuming you have form fields like 'feature1', 'feature2', etc.
feature1 = float(request.form['feature1'])
feature2 = float(request.form['feature2'])
# Add more features as needed
# Make predictions based on the user input
# Replace this with your actual prediction logic using the loaded model
# For simplicity, we'll assume a random prediction here
prediction = 'B' # Replace with your actual prediction
# Check if the predicted probability exceeds the threshold
if classification_report_data['precision'][prediction] > threshold:
result = f"The model predicts there is cancer ({prediction})."
else:
result = f"The model predicts there is no cancer ({prediction})."
# Generate a plot based on user input
plot_url = generate_plot(prediction)
!pip install flask
Requirement already satisfied: flask in c:\users\karri\anaconda3\lib\site-packages (2.2.2) Requirement already satisfied: Werkzeug>=2.2.2 in c:\users\karri\anaconda3\lib\site-packages (from flask) (2.2.2) Requirement already satisfied: click>=8.0 in c:\users\karri\anaconda3\lib\site-packages (from flask) (8.0.4) Requirement already satisfied: Jinja2>=3.0 in c:\users\karri\anaconda3\lib\site-packages (from flask) (3.1.2) Requirement already satisfied: itsdangerous>=2.0 in c:\users\karri\anaconda3\lib\site-packages (from flask) (2.0.1) Requirement already satisfied: colorama in c:\users\karri\anaconda3\lib\site-packages (from click>=8.0->flask) (0.4.6) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\karri\anaconda3\lib\site-packages (from Jinja2>=3.0->flask) (2.1.1)